import pandas as pd
import numpy as np
import random, os
# import torch
from typing import TypeVar, Tuple
import re
from copy import deepcopy
import plotly as py
import plotly.express as px
from plotly import graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()
# # fix random seeds for reproducibility (currently disabled)
# seed = 1514
# random.seed(seed)
# os.environ['PYTHONHASHSEED'] = str(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = True
# Alias used in the function annotations below.  A TypeVar is not meant to be
# used as a plain type alias, so point the name at the real pandas class;
# every annotation site (`-> DataFrame`, `df: DataFrame`) keeps working.
DataFrame = pd.DataFrame
# extract NER tag from sentence for label
# extract NER tag from sentence for label
def find_all_label(sentence: str) -> Tuple[list, np.ndarray, np.ndarray]:
    """Extract ``<name:TYPE>`` NER tags from *sentence*.

    Returns a 3-tuple (the original annotation claimed a 2-tuple):
        before_split -- list of raw ``name:TYPE`` strings, in tag order
        ner_array    -- np.ndarray of the tagged names
        label_array  -- np.ndarray of the entity types
    """
    # find every <...:...> shaped tag, then strip the angle brackets
    tags = re.findall(r'<.*?\:.*?>', sentence)
    tags = [t.replace('<', '').replace('>', '') for t in tags]
    # strings are immutable, so a shallow copy is enough (deepcopy was overkill)
    before_split = list(tags)
    # BUG FIX: with no tags, np.array([]) is 1-D and [..., 0] raised IndexError
    if not tags:
        return before_split, np.array([]), np.array([])
    # BUG FIX: split on the LAST ':' so a name containing ':' still yields
    # exactly two columns (plain split(':') produced a ragged array)
    pairs = np.array([t.rsplit(':', 1) for t in tags])
    # column 0: tagged names
    ner_array = pairs[..., 0]
    # column 1: entity types
    label_array = pairs[..., 1]
    return before_split, ner_array, label_array
# remove NER tag from sentence for input
# remove NER tag from sentence for input
def remove_label(sentence: str) -> str:
    """Strip ``<name:TYPE>`` tag markup and non-text symbols from a sentence.

    Keeps Korean syllables, ASCII letters, digits, '.', '%', '~'; everything
    else becomes a space, runs of spaces collapse, and the result is stripped.
    """
    # applied in order: drop the ':TYPE>' tail, drop '<' heads, blank out
    # disallowed characters, squeeze repeated spaces
    substitutions = (
        (r':.*?>', ''),
        (r'<', ''),
        (r'[^가-힣a-zA-Z0-9.%~]', ' '),
        ('[ ]+', ' '),
    )
    cleaned = sentence
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def get_data_from_txt(path: str) -> DataFrame:
    """Load the raw NER corpus (one sentence per line) into a DataFrame."""
    with open(path, 'r', encoding='UTF8') as handle:
        raw = handle.read()
    # the file ends with a trailing newline, so the final split() entry is
    # an empty string -- drop it
    sentences = raw.split('\n')[:-1]
    return pd.DataFrame({'sentence': sentences})
def preprocess(df: DataFrame, train: bool=True):
    """Attach label and input columns extracted from the raw sentences.

    df    -- DataFrame with a 'sentence' column of tagged text
    train -- when True, patch a known annotation typo in the train split

    Returns the same DataFrame, mutated in place with the new columns.
    """
    if train:
        # BUG FIX: use .loc instead of chained indexing
        # (df.sentence[10212] = ...), which pandas may apply to a temporary
        # copy and silently drop
        df.loc[10212, 'sentence'] = df.loc[10212, 'sentence'].replace('일녀<QT>', '<일녀:QT>')
    # (before_split, names, entities) tuple per sentence
    df['labels'] = df.sentence.apply(find_all_label)
    df['train_label'] = df.labels.apply(lambda x: x[0])
    df['name'] = df.labels.apply(lambda x: x[1])
    df['entity'] = df.labels.apply(lambda x: x[2])
    # tag-free plain text used as model input
    df['input_sentence'] = df.sentence.apply(remove_label)
    df['joined_entity'] = df.entity.apply(lambda x: ' '.join(x))
    return df
def get_train_df(path: str = '../genie_project/klue_ner_data/klue_ner_train_80.txt'):
    """Load and preprocess the train split (applies the train-only typo fix)."""
    raw_df = get_data_from_txt(path)
    return preprocess(df=raw_df, train=True)
def get_test_df(path: str = '../genie_project/klue_ner_data/klue_ner_test_20.txt'):
    """Load and preprocess the held-out test split."""
    raw_df = get_data_from_txt(path)
    return preprocess(df=raw_df, train=False)
if __name__ == '__main__':
    # Quick sanity check: build both splits and report their shapes.
    train_df = get_train_df()
    test_df = get_test_df()
    print(train_df.shape)
    print(test_df.shape)
# notebook output: (20802, 7) (5201, 7)
# character length of each input sentence
train_df['input_len'] = train_df.input_sentence.str.len()
# number of space-separated words ("eojeol") per input sentence
train_df['word_count'] = train_df.input_sentence.str.count(' ') + 1
# syllable count per input sentence (spaces excluded)
train_df['char_count'] = train_df.input_sentence.str.len() - train_df.input_sentence.str.count(' ')
# number of tagged names per sentence
train_df['name_count'] = train_df.name.apply(len)
# average word length per sentence
train_df['input_word_avg'] = train_df['char_count'] / train_df.input_sentence.str.split().apply(len)
# eyeball a few rows (notebook display cell; the value is not stored)
train_df.sample(n=5, random_state=10)
| sentence | labels | train_label | name | entity | input_sentence | joined_entity | input_len | word_count | char_count | name_count | input_word_avg | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15658 | 그래플링 <9단:QT> 교섭가와 타격 <15단:QT> 주민들의 치열하고도 구수한 공... | ([9단:QT, 15단:QT, 터미네이터:PS], [9단, 15단, 터미네이터], ... | [9단:QT, 15단:QT, 터미네이터:PS] | [9단, 15단, 터미네이터] | [QT, QT, PS] | 그래플링 9단 교섭가와 타격 15단 주민들의 치열하고도 구수한 공방전 설득의 법칙은... | QT QT PS | 79 | 17 | 63 | 3 | 3.705882 |
| 757 | 아나 .정말 영화본시간이 아깝다 .대충 중간쯤 실험인거 알았는데 마지막은 정말 어처... | ([2탄:QT], [2탄], [QT]) | [2탄:QT] | [2탄] | [QT] | 아나 .정말 영화본시간이 아깝다 .대충 중간쯤 실험인거 알았는데 마지막은 정말 어처... | QT | 69 | 14 | 56 | 1 | 4.000000 |
| 1569 | 이 영화 평점 왜이리 낮게 평가 된건지 평점 맞추기 위해 <십점:QT>! | ([십점:QT], [십점], [QT]) | [십점:QT] | [십점] | [QT] | 이 영화 평점 왜이리 낮게 평가 된건지 평점 맞추기 위해 십점 | QT | 34 | 11 | 24 | 1 | 2.181818 |
| 7138 | 억지로이어서 나갈려고하니 내용이 자꾸만 산으로가지 <1편:QT>까지 손해본 영화! | ([1편:QT], [1편], [QT]) | [1편:QT] | [1편] | [QT] | 억지로이어서 나갈려고하니 내용이 자꾸만 산으로가지 1편까지 손해본 영화 | QT | 39 | 8 | 32 | 1 | 4.000000 |
| 13157 | 낭만적이고 그림 같은 영화. <일요일:DT> <오후:TI>에 EBS에서 봤던 추억이. | ([일요일:DT, 오후:TI], [일요일, 오후], [DT, TI]) | [일요일:DT, 오후:TI] | [일요일, 오후] | [DT, TI] | 낭만적이고 그림 같은 영화. 일요일 오후에 EBS에서 봤던 추억이. | DT TI | 37 | 9 | 29 | 2 | 3.222222 |
# Histograms (with marginal box plots) for each engineered length/count feature.
figure1 = px.histogram(train_df, x='input_len', marginal='box', width=900, height=400,
                       color_discrete_sequence=px.colors.qualitative.Pastel)
figure2 = px.histogram(train_df, x='word_count', marginal='box', width=900, height=400,
                       color_discrete_sequence=px.colors.qualitative.Pastel)
figure3 = px.histogram(train_df, x='char_count', marginal='box', width=900, height=400,
                       color_discrete_sequence=px.colors.qualitative.Pastel)
figure4 = px.histogram(train_df, x='name_count', marginal='box', width=900, height=400,
                       color_discrete_sequence=px.colors.qualitative.Pastel)
figure1.show()
figure2.show()
figure3.show()
figure4.show()
# pip install git+https://github.com/haven-jeon/PyKoSpacing.git
# pip install --upgrade tensorflow==2.2.0
# conda install tensorflow
# check the installed TensorFlow version -- presumably required by
# pykospacing (see the install line above; TODO confirm). The pasted
# traceback below shows this environment's TF install was broken.
import tensorflow as tf
tf.__version__
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_25728/3726339526.py in <module> ----> 1 import tensorflow as tf 2 tf.__version__ ~\AppData\Roaming\Python\Python37\site-packages\tensorflow\__init__.py in <module> 100 101 # We still need all the names that are toplevel on tensorflow_core --> 102 from tensorflow_core import * 103 104 # In V1 API we need to print deprecation messages ~\AppData\Roaming\Python\Python37\site-packages\tensorflow_core\__init__.py in <module> 34 from tensorflow._api.v1 import autograph 35 from tensorflow._api.v1 import bitwise ---> 36 from tensorflow._api.v1 import compat 37 from tensorflow._api.v1 import config 38 from tensorflow._api.v1 import data ~\AppData\Roaming\Python\Python37\site-packages\tensorflow_core\_api\v1\compat\__init__.py in <module> 21 import sys as _sys 22 ---> 23 from tensorflow._api.v1.compat import v1 24 from tensorflow._api.v1.compat import v2 25 from tensorflow.python.compat.compat import forward_compatibility_horizon ~\AppData\Roaming\Python\Python37\site-packages\tensorflow_core\_api\v1\compat\v1\__init__.py in <module> 670 _current_module = _sys.modules[__name__] 671 try: --> 672 from tensorflow_estimator.python.estimator.api._v1 import estimator 673 _current_module.__path__ = ( 674 [_module_util.get_parent_dir(estimator)] + _current_module.__path__) ~\anaconda3\lib\site-packages\tensorflow_estimator\__init__.py in <module> 8 import sys as _sys 9 ---> 10 from tensorflow_estimator._api.v1 import estimator 11 12 del _print_function ~\anaconda3\lib\site-packages\tensorflow_estimator\_api\v1\estimator\__init__.py in <module> 8 import sys as _sys 9 ---> 10 from tensorflow_estimator._api.v1.estimator import experimental 11 from tensorflow_estimator._api.v1.estimator import export 12 from tensorflow_estimator._api.v1.estimator import inputs 
~\anaconda3\lib\site-packages\tensorflow_estimator\_api\v1\estimator\experimental\__init__.py in <module> 8 import sys as _sys 9 ---> 10 from tensorflow_estimator.python.estimator.canned.dnn import dnn_logit_fn_builder 11 from tensorflow_estimator.python.estimator.canned.kmeans import KMeansClustering as KMeans 12 from tensorflow_estimator.python.estimator.canned.linear import LinearSDCA ~\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in <module> 31 from tensorflow.python.keras.utils import losses_utils 32 from tensorflow.python.util.tf_export import estimator_export ---> 33 from tensorflow_estimator.python.estimator import estimator 34 from tensorflow_estimator.python.estimator.canned import head as head_lib 35 from tensorflow_estimator.python.estimator.canned import optimizers ~\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in <module> 51 from tensorflow_estimator.python.estimator import model_fn as model_fn_lib 52 from tensorflow_estimator.python.estimator import run_config ---> 53 from tensorflow_estimator.python.estimator import util as estimator_util 54 from tensorflow_estimator.python.estimator.export import export_lib 55 from tensorflow_estimator.python.estimator.mode_keys import ModeKeys ~\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\util.py in <module> 73 74 ---> 75 class _DatasetInitializerHook(tf.compat.v1.train.SessionRunHook): 76 """Creates a SessionRunHook that initializes the passed iterator.""" 77 AttributeError: module 'tensorflow' has no attribute 'compat'
# PyKoSpacing: automatic Korean spacing correction
from pykospacing import Spacing
spacing = Spacing()
# try several Korean morphological analyzers side by side
from konlpy.tag import Okt, Kkma, Komoran, Hannanum, Mecab
okt = Okt()
kkma = Kkma()
komoran = Komoran()
hannanum = Hannanum()
# Mecab needs an explicit dictionary path on Windows
mecab = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
train_df['input_word_avg'] # average word length of the original sentences
# (markdown cell) 본래문장에서 형태소 분석기 적용 -- apply the morphological analyzers to the original sentences
# collapse all whitespace out of a sentence
def remove_space(sentence):
    """Return *sentence* with every whitespace character deleted."""
    return re.sub(r'\s', '', sentence)
# NOTE(review): the preprocessed text column created above is
# 'input_sentence'; 'train_df.input' only works if an 'input' column was
# added in a cell not shown here -- verify before running.
train_df['no_space'] = train_df.input.apply(remove_space)
# TODO: run mecab on the fully joined (space-free) text
# TODO: run pykospacing on the joined text first, then mecab
# without spacing correction: extract nouns, POS-tag, and tokenize per analyzer
train_df['okt_nouns'] = train_df.input.apply(okt.nouns)
train_df['okt_pos'] = train_df.input.apply(okt.pos)
train_df['okt_morphs'] = train_df.input.apply(okt.morphs)
train_df['kkma_nouns'] = train_df.input.apply(kkma.nouns)
train_df['kkma_pos'] = train_df.input.apply(kkma.pos)
# BUG FIX: the four non-okt *_morphs columns below all called okt.morphs
# (copy-paste error); each now uses its own analyzer
train_df['kkma_morphs'] = train_df.input.apply(kkma.morphs)
train_df['komoran_nouns'] = train_df.input.apply(komoran.nouns)
train_df['komoran_pos'] = train_df.input.apply(komoran.pos)
train_df['komoran_morphs'] = train_df.input.apply(komoran.morphs)
train_df['hannanum_nouns'] = train_df.input.apply(hannanum.nouns)
train_df['hannanum_pos'] = train_df.input.apply(hannanum.pos)
train_df['hannanum_morphs'] = train_df.input.apply(hannanum.morphs)
train_df['mecab_nouns'] = train_df.input.apply(mecab.nouns)
train_df['mecab_pos'] = train_df.input.apply(mecab.pos)
train_df['mecab_morphs'] = train_df.input.apply(mecab.morphs)
# flatten the per-sentence name arrays into one list for entity statistics
names_list = sum(train_df['name'].apply(lambda x : list(x)), [])
from pykospacing import Spacing
spacing = Spacing()
# NOTE(review): these apply a callable Spacing instance per sentence;
# 'train_df.input' assumes an 'input' column exists -- verify (the
# preprocessed column is named 'input_sentence')
train_df['input_pykospacing'] = train_df.input.apply(spacing)
train_df['no_space_pykospacing'] = train_df.no_space.apply(spacing)
# --- length / entity-count chart scratch cells ---
# NOTE(review): 'plt' and 'sns' (matplotlib.pyplot / seaborn) are never
# imported in this file -- presumably imported in a cell not shown here.
fig = plt.figure(figsize=(12, 8))
# BUG FIX: this line previously ended with a stray '.' and was duplicated
# below with a stray trailing '1' (both syntax errors)
av_per_essay = train_df['entity'].value_counts(ascending=True)
# basic box plot
# fig = px.box(train_df, x="char_count")
# fig.show()
# NOTE(review): 'char_count_nospace' is never created in this file --
# verify the cell that builds it exists before running.
fig3d = px.scatter_3d(train_df, x='char_count', y='char_count_nospace', z='word_count')
fig3d.show()
count_fig = px.scatter(train_df, x='char_count', y='char_count_nospace', color='word_count')
count_fig.show()
fig, axs = plt.subplots(1, 2, figsize=(16, 6), gridspec_kw=dict(width_ratios=[4, 3]))
# BUG FIX: seaborn needs the data itself (or data= plus a column name);
# a bare column-name string plots nothing useful
sns.histplot(train_df['char_count'], binwidth=1, ax=axs[0])
sns.boxplot(x=train_df['char_count'], ax=axs[1])
# BUG FIX: removed the bare 'sum(j)' line -- 'j' is undefined in this file
train_df.columns
# import plotly.graph_objects as go
# One figure with a box plot per engineered length/count feature.
x_data = ['input_len', 'word_count', 'char_count', 'name_count']
N = 50  # NOTE(review): unused here; kept in case a later cell reads it
y_data = [train_df['input_len'], train_df['word_count'], train_df['char_count'], train_df['name_count']]
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)',
          'rgba(255, 65, 54, 0.5)']
fig = go.Figure()
for xd, yd, cls in zip(x_data, y_data, colors):
    fig.add_trace(go.Box(
        y=yd,
        name=xd,
        boxpoints='all',      # show every observation beside the box
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker_size=2,
        line_width=1)
    )
fig.update_layout(
    # BUG FIX: typo 'Lenght' -> 'Length' in the chart title
    title='Length of the input_sentence and name of entity',
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=False
)
# BUG FIX: the original 'fig.show(' was missing its closing parenthesis
fig.show()
# (markdown cell) 특성 x1과 x2의 유사도를 기반으로 -- based on the similarity between features x1 and x2
train_df
# %%
train_df['before_label'] = train_df.sentence.apply(find_all_label)
# %%
# NOTE(review): 'tokenizer' is not defined in this file -- presumably a
# subword tokenizer created in a missing cell; confirm before running.
train_df['input_token'] = train_df.sentence.apply(lambda x: tokenizer(x)['input_ids'])
# token count per sentence (overwrites the earlier character-length column)
train_df['input_len'] = train_df.input_token.apply(len)
# %%
import plotly.express as px
# distribution of input token lengths
train_count_df = train_df.groupby('input_len').count()
px.bar(train_count_df, x=train_count_df.index, y=train_count_df.input_token, color=train_count_df.index)
# %%
# tokenize the label side with the target tokenizer context
with tokenizer.as_target_tokenizer():
    train_df['label_token'] = train_df['before_label'].apply(lambda x: tokenizer(x)['input_ids'])
train_df['label_token_len'] = train_df.label_token.apply(len)
# %%
# 95th percentile of label token length (informs label_max_len below)
train_df.label_token_len.quantile(0.95)
# %%
label_count_df = train_df.groupby('label_token_len').count()
# %%
px.bar(label_count_df, x=label_count_df.index, y=label_count_df.label_token, color=label_count_df.index)
# %%
# sequence-length caps chosen from the distributions above
input_max_len = 84
label_max_len = 64
# persist the tagged DataFrame (utf-8-sig so Excel reads Korean correctly)
train_df.to_csv("konlpy_tagged_basic.csv", encoding='utf-8-sig')
def n_grams(tokens, n):
    """Return the list of n-grams over *tokens*, each as a tuple.

    If *tokens* has fewer than *n* elements, the result is empty.
    """
    total = len(tokens)
    grams = []
    for start in range(total):
        # only keep windows that fit entirely inside the sequence
        if start + n <= total:
            grams.append(tuple(tokens[start:start + n]))
    return grams
print(n_grams(['건너뛰기'], 2))
print(n_grams(['건너뛰기', 'ad', 'sdf'], 2))
# (markdown cell) 주파수 기반 임베딩이란 단어가 나오는 횟수를 기준으로 임베딩을 하는 것이다. 횟수만 따진다는 것은 문맥을 따지지 않는다는 것을 의미한다. 즉, 간단한 자연어 처리에서는 많이 사용하지만, 정확한 예측은 힘들다는 것을 인지하자.
# (translation: frequency-based embedding encodes a word by its occurrence count; counting alone ignores context, so it suits simple NLP but not precise prediction.)
# NOTE(review): these notebook cells ran out of order -- 'tfidf_vectorizer'
# is only created further below, and the preprocessed text column is
# 'input_sentence', not 'input'. Verify before running top-to-bottom.
ftr_vect = tfidf_vectorizer.fit(train_df['input']) # expects a list-like of strings
print(ftr_vect)
print(tfidf_vectorizer.vocabulary_) # inspect the learned vocabulary
# 1-2 grams; drop terms with document frequency below 0.05 or above 0.85
# tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=0.05, max_df=0.85)
train_df
# one space-joined noun string per sentence, for each analyzer
kkma_nouns_list = [' '.join(nouns) for nouns in train_df['kkma_nouns']]
hannanum_nouns_list = [' '.join(nouns) for nouns in train_df['hannanum_nouns']]
komoran_nouns_list = [' '.join(nouns) for nouns in train_df['komoran_nouns']]
mecab_nouns_list = [' '.join(nouns) for nouns in train_df['mecab_nouns']]
okt_nouns_list = [' '.join(nouns) for nouns in train_df['okt_nouns']]
from sklearn.feature_extraction.text import TfidfVectorizer

# Binary term presence, capped vocabulary. NOTE: the same vectorizer
# instance is re-fit per analyzer, so only the LAST fit's vocabulary
# survives in tfidf_vectorizer afterwards.
tfidf_vectorizer = TfidfVectorizer(binary=True, max_features=25000)
kkma_embeddings = tfidf_vectorizer.fit_transform(kkma_nouns_list).toarray()
hannanum_embeddings = tfidf_vectorizer.fit_transform(hannanum_nouns_list).toarray()
komoran_embeddings = tfidf_vectorizer.fit_transform(komoran_nouns_list).toarray()
mecab_embeddings = tfidf_vectorizer.fit_transform(mecab_nouns_list).toarray()
okt_embeddings = tfidf_vectorizer.fit_transform(okt_nouns_list).toarray()
# BUG FIX: removed the bare expression referencing kkma_embed_2d etc. --
# those names are only created by the TSNE cells further below, so the line
# raised NameError when the file runs top-to-bottom.
# # kmeans 간단한 테스트
# from sklearn.cluster import KMeans
# kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, random_state=1514, )
# kmeans.fit(embed_2d)
# train_df['cluster'] = kmeans.labels_
# # 실루엣 스코어 측정(높을수록 해당 cluster가 유의함)
# from sklearn.metrics import silhouette_samples
# sil_score = silhouette_samples(embed_2d, train_df.cluster)
# # %%
# np.mean(sil_score)
# 실루엣 스코어를 측정하고, 차트를 그려주는 함수
# Compute silhouette scores for several cluster counts and chart them.
def visualize_silhouette(cluster_lists, X_features):
    """Run KMeans for each cluster count in *cluster_lists* on *X_features*
    and draw one silhouette plot per count (red dashed line = average score).

    cluster_lists -- iterable of candidate cluster counts
    X_features    -- array-like of shape (n_samples, n_features)
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    from tqdm import tqdm
    # (removed unused make_blobs / math imports)

    # one subplot per requested cluster count
    n_cols = len(cluster_lists)
    fig, axs = plt.subplots(figsize=(4*n_cols, 4), nrows=1, ncols=n_cols)
    # BUG FIX: with a single cluster count plt.subplots returns a bare Axes
    # (not an array), so axs[ind] below would fail -- normalize to a list
    if n_cols == 1:
        axs = [axs]
    for ind, n_cluster in enumerate(tqdm(cluster_lists, desc='kmeans fitting')):
        # fit KMeans, then the average and per-sample silhouette values
        clusterer = KMeans(n_clusters=n_cluster, max_iter=500, random_state=0)
        cluster_labels = clusterer.fit_predict(X_features)
        sil_avg = silhouette_score(X_features, cluster_labels)
        sil_values = silhouette_samples(X_features, cluster_labels)
        y_lower = 10
        axs[ind].set_title('Number of Cluster : '+ str(n_cluster)+'\n' \
                          'Silhouette Score :' + str(round(sil_avg,3)) )
        axs[ind].set_xlabel("The silhouette coefficient values")
        axs[ind].set_ylabel("Cluster label")
        axs[ind].set_xlim([-0.1, 1])
        axs[ind].set_ylim([0, len(X_features) + (n_cluster + 1) * 10])
        axs[ind].set_yticks([])  # Clear the yaxis labels / ticks
        axs[ind].set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
        # one fill_betweenx band per cluster, stacked bottom-to-top
        for i in tqdm(range(n_cluster), desc='plotting'):
            ith_cluster_sil_values = sil_values[cluster_labels == i]
            ith_cluster_sil_values.sort()
            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i
            color = cm.nipy_spectral(float(i) / n_cluster)
            axs[ind].fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_sil_values,
                                   facecolor=color, edgecolor=color, alpha=0.7)
            axs[ind].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10
        axs[ind].axvline(x=sil_avg, color="red", linestyle="--")
# cluster counts to evaluate (edit this list freely)
cluster_list = [2,3,4,5,6,7]
# t-SNE reduces the TF-IDF vectors to 2-D (x, y)
from sklearn.manifold import TSNE
t_sne = TSNE()
kkma_embed_2d = t_sne.fit_transform(kkma_embeddings)
# select which analyzer's embedding to evaluate
embed_2d = kkma_embed_2d
visualize_silhouette(cluster_list, embed_2d)
# The remaining four analyzers repeated the same "t-SNE to 2-D, then
# silhouette chart" cell verbatim; collapsed into one loop. Behavior and
# all final module-level names (hannanum_embed_2d, ..., embed_2d, t_sne)
# are preserved, in the same order.
from sklearn.manifold import TSNE

_tsne_inputs = [
    ('hannanum', hannanum_embeddings),
    ('komoran', komoran_embeddings),
    ('mecab', mecab_embeddings),
    ('okt', okt_embeddings),
]
_embed_2d_by_name = {}
for _name, _embeddings in _tsne_inputs:
    # a fresh TSNE instance per analyzer, exactly as the original cells did
    t_sne = TSNE()
    _embed_2d_by_name[_name] = t_sne.fit_transform(_embeddings)
    embed_2d = _embed_2d_by_name[_name]
    visualize_silhouette(cluster_list, embed_2d)
hannanum_embed_2d = _embed_2d_by_name['hannanum']
komoran_embed_2d = _embed_2d_by_name['komoran']
mecab_embed_2d = _embed_2d_by_name['mecab']
okt_embed_2d = _embed_2d_by_name['okt']
# Cluster the documents with K-means (the original comment said 3 clusters,
# but n_clusters=2 is what is actually used)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, max_iter=10000, random_state=42)
# NOTE(review): 'ftr_vect' above is the *fitted vectorizer* returned by
# .fit(), not a document-term matrix -- fit_predict on it will fail;
# presumably tfidf_vectorizer.transform(...) was intended. Verify.
cluster_label = kmeans.fit_predict(ftr_vect)
# attach the cluster labels to the DataFrame
train_df['cluster_label'] = cluster_label
# print(train_df.sort_values(by=['cluster_label']))
# NOTE(review): 'tfidf_matrix' is not defined anywhere in this file
print(tfidf_matrix.toarray())
sectence = train_df['input'][0]  # NOTE(review): typo for 'sentence'?; value is never used
print(tfidf_vectorizer.transform(train_df['input']).toarray())
tfidf_vectorizer.fit(train_df['input']) # expects a list-like of strings
print(tfidf_vectorizer.vocabulary_) # inspect the learned vocabulary
tfidf_vectorizer.transform(train_df['input']).toarray()
def tokenize_by_morpheme_char(s):
    # morpheme-level tokens via the module-level Mecab instance
    return mecab.morphs(s)
def tokenize_by_morpheme_jaso(s):
    # NOTE(review): 'to_jaso' is not defined in this file -- presumably a
    # jamo-decomposition helper from a missing cell; confirm it exists.
    return [to_jaso(token) for token in tokenize_by_morpheme_char(s)]
# turn a token sequence into its list of n-grams
def to_ngrams(words, n):
    """Return every length-*n* window of *words* as a tuple, in order."""
    last_start = len(words) - n + 1
    return [tuple(words[start:start + n]) for start in range(0, last_start)]
train_df  # notebook display cell; value is not stored
import pandas as pd
# minimal demonstration of DataFrame.replace: every 1 becomes 5
df = pd.DataFrame({'X': [1, 2, 3], 'Y': [4, 1, 8]})
replaced_df = df.replace(1, 5)
replaced_df